from urllib.parse import urljoin

import chardet
import requests
from bs4 import BeautifulSoup

# Index page listing every chapter of 红楼梦 (Dream of the Red Chamber).
url = 'http://mz.guoxuelu.com/hongloumeng.htm'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'
}
# Fetch the index page.  Fix: the original omitted `headers` here even though
# every chapter request sends them — be consistent so the site does not treat
# the index request differently.  A timeout keeps a stalled server from
# hanging the script forever.
response_main = requests.get(url, headers=headers, timeout=30)
file_name = 'red_UTF82.txt'

# Only proceed if the index request succeeded.
if response_main.status_code == 200:
    # Fix: the index page is not guaranteed to be UTF-8 (this site commonly
    # serves GBK); detect its encoding with chardet exactly like the chapter
    # pages, instead of trusting requests' charset guess via `.text`.
    main_encoding = chardet.detect(response_main.content)['encoding']
    soup = BeautifulSoup(response_main.content, 'html.parser',
                         from_encoding=main_encoding)

    # The chapter links live inside <div class="zhengwen">.
    main_div = soup.find('div', class_='zhengwen')
    a_tags = main_div.find_all('a') if main_div else []

    # Fix: resolve every href against the index URL so relative links work
    # too, and skip anchors that carry no href at all.
    hrefs = [urljoin(url, a.get('href')) for a in a_tags if a.get('href')]

    # Fix: open the output file once instead of reopening it (append mode)
    # for every single chapter.
    with open(file_name, 'a', encoding='utf-8') as file:
        for href in hrefs:
            print(href)
            # Fetch one chapter page.
            response = requests.get(href, headers=headers, timeout=30)
            if response.status_code != 200:
                print('Failed to retrieve the webpage')
                continue

            # Detect the chapter page's encoding with chardet and let
            # BeautifulSoup decode with it.
            encoding = chardet.detect(response.content)['encoding']
            soup = BeautifulSoup(response.content, 'lxml',
                                 from_encoding=encoding)

            # Chapter title is in <div class="wenztit">, the body text in
            # <div class="wenzbody">.  Fix: guard against a page whose layout
            # differs — the original crashed with AttributeError on None and
            # aborted the whole run.
            title_div = soup.find('div', class_='wenztit')
            body_div = soup.find('div', class_='wenzbody')
            if title_div is None or body_div is None:
                print(f'Unexpected page layout, skipping {href}')
                continue
            title = title_div.get_text()

            # The first <p> of the body is navigation boilerplate, not
            # chapter text — drop it (guarded: it may be absent).
            first_p = body_div.find('p')
            if first_p is not None:
                first_p.extract()

            # Append this chapter (title + text) to the output file.
            file.write(title + body_div.get_text() + '\n')
            print(title + ' 内容已追加到文件中' + '\n')

    print(f'红楼梦爬取完毕, 内容已保存到{file_name}中')

else:
    print('Failed to retrieve the webpage')